1. Introduction

The purpose of this vignette is to document the process of removing outliers in the data set COS in the environment of R with the aim at exploring different accuraccies that we can achieve in the classification for the year 2017. As part of the supervised classification, besides the imagery for 2017, we account with a training dataset called COS (Carta de uso e Ocupacao do solo) composed by 13000 samples categorized by different LCLU types (1000 samples each type). Since our training dataset differs in date and source of adquisition respect the imagery (labels done by interpretation of aerial imagery), we need to trace the differences between what the labels explain and what actually happens in the ground. Moreover, the dataset contains two labels: descricao and CLASS_NAME. The labels of descrica correspond to the level 5 of COS nomenclature. That is, use of the land cover. The label CLASS_NAME corresponds to a generalization done by the institute DGT. I will play with both labels dependending on how they match with the spectral information of the imagery.

2. Methodology

Therefore, as first attempt to explore the ranges of NDVI per class, we recreated the NDVI for 29 images available for the year 2017 an extract the NDVI values to the 13000 points. Since We expect certain homogeneity in the NDVI values per class and per time, we think that the set of those values that do not follow this trend are probably points that may change for natural or anthropogenic reasons; or simply the label is not enough representation of the area covered by the pixel.

Therefore, to measure the statistical dispersion of NDVI per class and per time we are going to use two methods, interquantile ranges and standart deviations.

library(sf)
## Linking to GEOS 3.6.2, GDAL 2.2.3, PROJ 4.9.3
library(ggplot2)
folder_path = "/home/user/Documents/TESISMASTER/VECTOR/Analysis_outliers/training_samples2.shp"
#Days of the images
day_shoot = c("2017/01/15", "2017/04/05", "2017/05/25", "2017/06/04", "2017/06/14",
              "2017/07/04", "2017/07/09", "2017/07/14", "2017/07/24", "2017/07/29",
              "2017/08/03", "2017/08/08", "2017/08/13", "2017/08/18", "2017/09/02" ,
              "2017/09/07", "2017/09/12", "2017/09/22", "2017/09/27", "2017/10/02",
              "2017/10/12", "2017/10/22", "2017/10/27", "2017/11/11", "2017/11/16",
              "2017/11/21", "2017/12/01","2017/12/16", "2017/12/21")

#converting days characters to time format in R 
day_shoot= strptime(as.character(day_shoot), "%Y/%m/%d")
#reading shapefile
dataset = sf::read_sf(folder_path)

2.1 Method 1: interquantile range analysis

IQR method makes detection of ouliers by looking at values more than one and half times the IQR distance below the first quartil and above the third quartil. IQR analysis is attractive because it considers the median as the measurement of centrality and thus is considered resistant to outliers. The following equations shows how to calculate this range per time for especific land cover type.

\[ IQR^{t} = Q_{3}^{t} - Q_{1}^{t} \] \[ Limits^{t} = Q_{1}^{t} \pm 1.5*IQR^{t} \]

model1 =  function(x, min_x, max_x){
  ind_t = which(x< min_x | x > max_x)
  if(length(ind_t)!=0){
    x[ind_t] = NA  
  }
  qua = quantile(x,na.rm=TRUE)
  q25 =qua[2]
  q75 =qua[4]
  IQR = q75 - q25
  limit_left = q25 - 1.5*IQR
  limit_rigth = q75 + 1.5*IQR
  ind = which(x<= limit_left | x >= limit_rigth | is.na(x))
  return(ind)
}

2.2 Method 2: Standard deviation analysis

And alternative to interquantile range analysis is to explore the values that are at different standard deviations from the mean of the data (here two standard deviations). Standard deviation analysis is also a measurement of dispersion, however, it is problematic for not being resistant to the presence of outliers and, furthermore, it works under the assumption that data must follow certain normality.

\[ Limits^{t} = \bar{x}^{t} \pm n \cdot \sigma ^{t} \]

model2 =  function(x, min_x, max_x){
  ind_t = which(x< min_x | x > max_x)
  if(length(ind_t)!=0){
    x[ind_t] = NA  
  }
  sd_x = sd(x,na.rm = TRUE)
  mean_x = mean(x,na.rm = TRUE)
  limit_left = mean_x - 1*sd_x
  limit_rigth = mean_x + 1*sd_x
  #filter according with the threshold and central tendency criterium
  ind = which(x < limit_left | x > limit_rigth | is.na(x))
  return(ind)
}

2.3 Implementation

Well, basically the next function apply the previous methods and return a suitable dataframe for the use of ggplot2 library.

#THis function remove the possible outliers of the data set, considering different classes and times.
removing_outliers = function(datast,
                               methodo = c("None","model_1", "model_2"), day_shoot,pivot = "CLASS_NAME"){
  #selecting model
  list_out = vector("list",2)
  func_choice <- switch(methodo,
                       'None'='None',
                       'model_1'=model1,
                       'model_2'=model2
                       )
  names_dt = colnames(datast) 
  ind_pivot = which(names_dt == pivot)
  #keeping only name of the pivot
  names_dt2 = c(names_dt[ind_pivot],names_dt[3:length(names_dt)])
  #subsetting columns
  data_st = datast[,names_dt2]
  colnames(data_st) = c("class",names_dt[3:length(names_dt)])
  funct_order = function(x, ind) ind[x]
  st_geometry(data_st) = NULL
  output = vector("list",ncol(data_st)-1)
  names(output) = colnames(data_st[,-1])
  #loop methodology over different classes
  for(i in unique(data_st$class)){
    index = which(data_st$class == i)
    df = data_st[index,-1]
    list_ind = lapply(df,func_choice, min_x = -1, max_x = 1)
    list_ind2 = lapply(list_ind, funct_order, ind= index)
    for(j in 1:length(list_ind2)){
      output[[j]] = c(list_ind2[[j]], output[[j]])
    }
  }
  #creating new data frame
  list_df = vector("list", length(output))
  names(list_df) = names(output)
  for(k in 1:length(output)){
    nu = output[[k]]
    if(length(nu) == 0){
      list_df[[k]] = datast[,c(pivot,names(output)[k])]
    }else{
      list_df[[k]] = datast[-nu,c(pivot,names(output)[k])]
    }
  }
  names(list_df) = day_shoot
  return(list_df) 
}  
  

#============================
#adapting results to ggplot
#============================

dataframe_ggplot = function(x){
  #selecting column
  selecting_column = function(x, class_number){
    names(x)[class_number] = "selection"
    return(x$selection)
  }
  #building data frame
  df_class = lapply(x, selecting_column, 1)
  df_ndvi = lapply(x, selecting_column, 2)
  column_class = reshape2::melt(df_class)
  df_gp = cbind(column_class, unlist(df_ndvi)) 
  colnames(df_gp) = c("class","time","NDVI")
  return(df_gp)
}


#fitting a data frame with ndvi columns to one data frame suitable to be plot for ggplot
func_median = function(x) median(x, na.rm = TRUE)
func = function(x) x
trajectories_boxplot = function(df,day_shoot){
  st_geometry(df) = NULL
  df_names = colnames(df)
  colnames(df) = c("category",df_names[-1])
  output= NULL
  for(i in unique(df$category)){
    index = which(df$category == i)
    df_list = lapply(df[index,-1], func)
    names(df_list) = day_shoot
    df0 = reshape2::melt(df_list)
    df1 = cbind(df0,i)
    output = rbind(output , df1)
  }
  colnames(output) = c("NDVI","time","class")
  return(output)
}

3. Results

I will go through all the classes analysing the ndvi values in order to set which samples must be out of the training data.

3.1 Forest

In the following graphic we can see the dispersion of ndvi for three different classes of trees per image over the year 2017. The classes of coniferos and eucaliptus depict similar variance and range of ndvi over the year. Instead, Holm and cork trees depict lower variances and a lower ranges of ndvi. Besides that, we can visualize the presence of possible anomaly data according with the black points located beyond the whiskers of the box.

library(reshape2)
query_dataset_trees = dataset[dataset$CLASS_NAME %in% c('Holm_and_Cork_Trees',
                                                   'Eucalyptus_trees',
                                                   'Coniferous_trees') 
                                                   ,c(-1)]

trajectories_trees  = trajectories_boxplot(query_dataset_trees, day_shoot)
trajectories_trees$class = factor(trajectories_trees$class,  labels = c('Holm_and_Cork_Trees', 'Eucalyptus_trees', 'Coniferous_trees'))

ggplot(trajectories_trees, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

Since I want to evaluate the impact of the presence of the anomaly data in the final accuraccy of the classification, I will remove those outliers assuming the two methodologies proposed above.

3.1.1 Interquantile range

query_dataset_trees = dataset[dataset$CLASS_NAME %in% c('Holm_and_Cork_Trees',
                                                   'Eucalyptus_trees',
                                                   'Coniferous_trees')
                                                   ,]
trees_m1 = removing_outliers(query_dataset_trees, methodo = c("model_1"),day_shoot)
df_trees = dataframe_ggplot(trees_m1)
ggplot(df_trees, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.1.2 Interquantile range

Although the previous graphic looks similar, I only want to emphasize that in the attempt of removing outliers by implementing standard deviation we will have probably less values than implementing the interquantile range. For example, using the previos technique we reduce the sample of forest in total to 10% and in the case of the satandar deviation to 15%. I will inclinate for the methodology that is more resistant to outliers and also avoid a loss of too much information.

trees_m2 = removing_outliers(query_dataset_trees, methodo = c("model_2"),day_shoot)
df_trees = dataframe_ggplot(trees_m2)
ggplot(df_trees, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.1.3 Merging classes for forest

Since the range of the espectral variation over the time of these three classes are particularly the same, we may expect high confusion in their classification and therefore a multitemporal analysis per pixel not the right answer to separete this sort of classes. In this sense, since I already try to perform classification using the three classes separetely, not having good results of course, I want to join in this opportunity the classes. THerefore, I will select randomly 1000 samples that falls within a margen of 0.3 and 1. THis range came from some researchs where it says that vegetation as trees must have values larger than 0.3

remove_values_forest = function(x){
  colnames(x) = c("CLASS_NAME","NDVI","geometry")
  ind1 = which(x$NDVI >= 0.3)
  ind2 = sample(length(ind1),1000)
  x1 = x[ind1[ind2],]
  x1$CLASS_NAME = "Forest"
  return(x1)
}
query_trees_tr = lapply(trees_m2,remove_values_forest)
df_trees_plot = dataframe_ggplot(query_trees_tr)
ggplot(df_trees_plot, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()

3.2 Shrubs and herbaceus vegetation asociation

as part also of the forest and seminatural areas we have

library(reshape2)
query_dataset_veg_asociation = dataset[dataset$CLASS_NAME %in% c("Bushes_and_shrubs","Natural_Herbaceous" ) 
                                                   ,c(-1)]
trajectories_veg_association  = trajectories_boxplot(query_dataset_veg_asociation, day_shoot)
ggplot(trajectories_veg_association , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.2.1 Interquantile range

query_dataset_veg_asociation = dataset[dataset$CLASS_NAME %in% c("Bushes_and_shrubs","Natural_Herbaceous" ) 
                                                   ,]
veg_asociation_m1 = removing_outliers(query_dataset_veg_asociation, methodo = c("model_1"),day_shoot)
df_veg_asociation = dataframe_ggplot(veg_asociation_m1)
ggplot(df_veg_asociation, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.2.2 Interquantile range

veg_asociation_m2 = removing_outliers(query_dataset_veg_asociation, methodo = c("model_2"),day_shoot)
df_veg_asociation = dataframe_ggplot(veg_asociation_m2)
ggplot(df_veg_asociation, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.2.3 Summary

#function to count labels after performing cleaning data
func_barchar = function(w){
  count_labels = function(x){
    colnames(x) = c("class", colnames(x)[2:3])
    st_geometry(x) = NULL
    sp1 = split(x,x$class)
    x1 = lapply(sp1, nrow)
    return(melt(x1))
  }
  list_count = lapply(w, count_labels)
  output = NULL
  for(i in 1:length(list_count)){
    d = data.frame(list_count[[i]],time= names(list_count[i]))
    output = rbind(output, d) 
  }
  return(output)
}

out_barchar = func_barchar(veg_asociation_m1)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

#graphic with the second cleaning technique
out_barchar = func_barchar(veg_asociation_m2)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

### 3.2.3 Merging classes for Shrubs and herbaceus vegetation asociation

AS I did above with forest, I will seelect randomly 1000 samples per date and create a new object with the samples for Shrubs and herbaceus vegetation asociation

remove_values_veg_asociation = function(x){
  colnames(x) = c("CLASS_NAME","NDVI","geometry")
  ind = sample(length(x$CLASS_NAME),1000)
  x1 = x[ind,]
  x1$CLASS_NAME = "Shrubs"
  return(x1)
}
query2_veg_asociation = lapply(veg_asociation_m2,remove_values_veg_asociation)
df_veg_asociation = dataframe_ggplot(query2_veg_asociation)
ggplot(df_veg_asociation , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()

3.2.4 Vegetation over the time

In this attempt I want to convert labels associated with natural herbaceus to non vegetation in the case they have lower ndvi values than 0.3. Therefore, I will implement the following:

convert_no_vegetated = function(x){
  colnames(x) = c("class","NDVI","geometry")
  ind = which(x$NDVI < 0.3)
  x[ind,"class"] = "Non_vegetated"
  return(x)
}
query3_veg_asociation = lapply(query2_veg_asociation,convert_no_vegetated)

3.3 Rice fields and Herbaceous permanent

I want to highligth with the next graphic the strong differences between herbaceus permanent and rice fields trajectories of NDVI. The notable espectral difference is because rice fields is part of temporal crops that need of irrigation after the sowing stage. In this sense, we expect this class differs from the herbaceus permanent due to the moisture treatment over the crops in the period of summer.

query_dataset_agricultural = dataset[dataset$CLASS_NAME %in% c('Rice_fields',
                                                   'Herbaceous_permanet') 
                                                   ,c(-1)]

trajectories_agricultural  = trajectories_boxplot(query_dataset_agricultural, day_shoot)

ggplot(trajectories_agricultural, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.3.1 Interquantile range

query_dataset_agricultural = dataset[dataset$CLASS_NAME %in% c('Rice_fields',
                                                   'Herbaceous_permanet')
                                                   ,]
agricultural_m1 = removing_outliers(query_dataset_agricultural, methodo = c("model_1"),day_shoot)
df_agricultural = dataframe_ggplot(agricultural_m1)
ggplot(df_agricultural, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.3.2 Standard deviation

query_dataset_agricultural = dataset[dataset$CLASS_NAME %in% c('Rice_fields',
                                                   'Herbaceous_permanet') 
                                                   ,]
agricultural_m2 = removing_outliers(query_dataset_agricultural, methodo = c("model_2"),day_shoot)
df_agricultural = dataframe_ggplot(agricultural_m2)
ggplot(df_agricultural, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

getting rid of ndvi values lower than 0

remove_values_agriculture = function(x){
  colnames(x) = c("CLASS_NAME","NDVI","geometry")
  ind1 = which(x$NDVI >= 0)
  x1 = x[ind1,]
  return(x1)
}
query_agricultural_tr = lapply(agricultural_m1,remove_values_agriculture)
df_agricultural_plot = dataframe_ggplot(query_agricultural_tr)
ggplot(df_agricultural_plot, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.3.3 Summary

#graphic with the second cleaning technique
out_barchar = func_barchar(query_agricultural_tr)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

3.4 Herbaceous Periodic

This class is composed by two signals, one that correspond to irrigated crops and onother to non- irrigated.

query_dataset_temporalcrops = dataset[dataset$CLASS_NAME %in% c("Herbaceous_periodic") 
                                                   ,c(-2)]
trajectories_temporalcrops  = trajectories_boxplot(query_dataset_temporalcrops, day_shoot)
ggplot(trajectories_temporalcrops, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.4.1 Interquantile range

query_dataset_temporalcrops  = dataset[dataset$CLASS_NAME %in%  c("Herbaceous_periodic"),]
temporalcrops_m1 = removing_outliers(query_dataset_temporalcrops , methodo = c("model_1"),day_shoot,pivot = "DESCRICAO")
df_temporalcrops = dataframe_ggplot(temporalcrops_m1)
ggplot(df_temporalcrops, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.4.2 Standart deviation

temporalcrops_m2 = removing_outliers(query_dataset_temporalcrops , methodo = c("model_2"),day_shoot,pivot = "DESCRICAO")
df_temporalcrops = dataframe_ggplot(temporalcrops_m2)
ggplot(df_temporalcrops, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.4.3 Summary

#graphic with the second cleaning technique
out_barchar = func_barchar(temporalcrops_m2)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

3.2.5 Herbaceous periodic to non vegetation

In this attempt I want to convert labels associated with natural herbaceus to non vegetation in the case they have lower ndvi values than 0.3. Therefore, I will implement the following:

convert_no_vegetated2 = function(x){
  colnames(x) = c("CLASS_NAME","NDVI","geometry")
  ind = which(x$NDVI < 0.3)
  x$CLASS_NAME = "Herbaceous_periodic"
  x[ind,"CLASS_NAME"] = "Non_vegetated"
  return(x)
}

temporalcrops_m3 = lapply(temporalcrops_m2,convert_no_vegetated2)

3.5 Non - vegetated

Non-vegetated class tend to depict near infrared reflectance somehow larger than the red. This sligh difference turns out in low positive values of NDVI. According with the literature (cite) we may have this classs in the range of 0 and 0.3 ndvi. Let’s have a look of the NDVI signal over the year 2017 and evaluate what percentage of the information meets this criterion.

query_dataset_no_vegetated = dataset[dataset$CLASS_NAME %in% c("Non_vegetated") 
                                                   ,]
table_nv = table(query_dataset_no_vegetated$DESCRICAO)
df_nv = data.frame(table_nv, pos= as.numeric(table_nv)*0.5)
ggplot(data=df_nv , aes(x=Var1, y=Freq)) +
  coord_flip() +
  geom_bar(stat="identity", fill=c("firebrick2","steelblue","olivedrab4")) +
  geom_text(aes(label = Freq, y= pos))

query_dataset_no_vegetated = dataset[dataset$CLASS_NAME %in% c("Non_vegetated") 
                                                   ,c(-2)]
trajectories_no_vegetated  = trajectories_boxplot(query_dataset_no_vegetated , day_shoot)
ggplot(trajectories_no_vegetated, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.5.1 interquantile range

since sparse vegetation and bare rocks depic ndvi values larger than cero I will take them out of the sampling.

query_dataset_no_vegetated = dataset[dataset$DESCRICAO %in% c("3.3.1.01.1 Praias, dunas e areais interiores") 
                                                   ,]
non_vegetated_m1 = removing_outliers(query_dataset_no_vegetated , methodo = c("model_1"),day_shoot,pivot = "DESCRICAO")
df_non_vegetated_m1 = dataframe_ggplot(non_vegetated_m1)
ggplot(df_non_vegetated_m1, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

### 3.5.2 Summary

change_name_class = function(x) {
  x$DESCRICAO = "Non_vegetated"
  return(x)
  }
non_vegetated_m2 = lapply(non_vegetated_m1, change_name_class)
#graphic with the second cleaning technique
out_barchar = func_barchar(non_vegetated_m2)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

3.6 Water

Class water is composed for several categories, I want to explore how many samples we have per class using as reference the 1000 we have.

dataset_water = dataset[dataset$CLASS_NAME == "Water" ,] 
table_water = table(dataset_water$DESCRICAO)
df_water = data.frame(table_water, pos= as.numeric(table_water)*0.5)
ggplot(data=df_water , aes(x=Var1, y=Freq)) +
  coord_flip() +
  geom_bar(stat="identity", fill="steelblue") +
  geom_text(aes(label = Freq, y= pos))

Since the spectral response defined by the pixel obeys to the predominant land cover type around the point label and not necessarily to the type which the point is representing, it is advisable to get rid of those points of water where I pressume the water can be hardly identified for this sensor due to their size. In this sense, I will take the following categories out: charcas (puddle), reservatorios de represas (dams), lagos interiores naturais (inland natural lakes), lagos and lagoeas interiores artificiais (inland artificial lakes) and canais artificias (artificial channels).

Let’s explore how the rest of categories looks like.

query_dataset_water = dataset[dataset$DESCRICAO %in% c("5.2.2.01.1 Desembocaduras fluviais" , "5.1.1.01.1 Cursos de água naturais", "5.1.2.02.1 Reservatórios de barragens" ),-1]

trajectories_water  = trajectories_boxplot(query_dataset_water, day_shoot)
ggplot(trajectories_water, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.6.1 standard deviation

query_dataset_water = dataset[dataset$DESCRICAO %in% c("5.2.2.01.1 Desembocaduras fluviais" , "5.1.1.01.1 Cursos de água naturais", "5.1.2.02.1 Reservatórios de barragens" ),]
water_m2 = removing_outliers(query_dataset_water , methodo = c("model_2"),day_shoot)
df_water_m2 = dataframe_ggplot(water_m2)
ggplot(df_water_m2, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.6.2 Summary

#graphic with the second cleaning technique
out_barchar = func_barchar(water_m2)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

3.7 Sealed

Here class sealed is composed for different ccategories. Let’s have a look of how many samples we have per category.

dataset_sealed = dataset[dataset$CLASS_NAME == "Sealed" , ]
table_sealed = table(dataset_sealed$DESCRICAO)
df_sealed = data.frame(table_sealed , pos= as.numeric(table_sealed)*0.5)
ggplot(data=df_sealed , aes(x=Var1, y=Freq)) +
  coord_flip() +
  geom_bar(stat="identity", fill="steelblue") +
  geom_text(aes(label = Freq, y= pos))

let’s see the trajectories

query_dataset_sealed = dataset[dataset$DESCRICAO %in% c("1.2.2.01.1 Rede viária e espaços associados",
                                                        "1.2.1.01.1 Indústria",
                                                        "1.1.1.02.1 Tecido urbano contínuo predominantemente horizontal"),-2]

trajectories_sealed  = trajectories_boxplot(query_dataset_sealed , day_shoot)
ggplot(trajectories_sealed, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.7.1 Standard deviation

query_dataset_sealed = dataset[dataset$DESCRICAO %in% c("1.2.2.01.1 Rede viária e espaços associados",
                                                        "1.2.1.01.1 Indústria",
                                                        "1.1.1.02.1 Tecido urbano contínuo predominantemente horizontal"),]
sealed_m2 = removing_outliers(query_dataset_sealed , methodo = c("model_2"),day_shoot)
df_sealed_m2 = dataframe_ggplot(sealed_m2)
ggplot(df_sealed_m2, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.7.1 Summary

#graphic with the second cleaning technique
out_barchar = func_barchar(sealed_m2)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

3.8 Wetlands

As it was done with water, I will keep only the time series which values at least one negative. This condition will allow me to leave samples that over time changed by other land cover type no intersected by water.

dataset_wetlands = dataset[dataset$CLASS_NAME == "Wetlands",-1]
trajectories_wetlands  = trajectories_boxplot(dataset_wetlands , day_shoot)
ggplot(trajectories_wetlands , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.8.1 interquantile range

dataset_wetlands = dataset[dataset$CLASS_NAME == "Wetlands",]
wetlands_m1 = removing_outliers(dataset_wetlands , methodo = c("model_1"),day_shoot)
df_wetlands_m1 = dataframe_ggplot(wetlands_m1)
ggplot(df_wetlands_m1 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.8.2 Standart deviation

dataset_wetlands = dataset[dataset$CLASS_NAME == "Wetlands",]
wetlands_m2 = removing_outliers(dataset_wetlands , methodo = c("model_2"),day_shoot)
df_wetlands_m2 = dataframe_ggplot(wetlands_m2)
ggplot(df_wetlands_m2 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.8.3 Summary

#graphic with the second cleaning technique
out_barchar = func_barchar(wetlands_m2)
ggplot(data = out_barchar , aes(x=time, y=value, fill = L1)) +
  geom_bar(stat="identity",position="dodge") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

4. Multitemporal data

datafinal = list(query_trees_tr, query3_veg_asociation,query_agricultural_tr,temporalcrops_m3,non_vegetated_m2,water_m2,sealed_m2,wetlands_m2)

out= NULL
merge_list_class = function(x){
  for(i in 1:length(x)){
    df0 = cbind(names(x[i]),x[[i]])
    colnames(df0) = c("time","CLASS_NAME","NDVI", "geometry")
    out = rbind(df0,out)
  }
  return(out)
}

merge1 = lapply(datafinal,merge_list_class)
outfinal= NULL
for(i in 1:length(merge1)){
    outfinal = rbind(merge1[[i]],outfinal)
}

graphic

conteo_df = function(x){
  return(table(x$CLASS_NAME))
}

listdata = split(outfinal,outfinal$time)
conteo_list = lapply(listdata,conteo_df)
out_conteo = NULL
for(k in 1:length(conteo_list)){
  df0 = data.frame(conteo_list[[k]],time = names(conteo_list[k]))
  out_conteo = rbind(df0,out_conteo)
}
#class_names
factor_classes = unique(out_conteo$Var1)

#sum per class
fun_sum = function(x) sum(x$Freq)
conteo_total = lapply(split(out_conteo,out_conteo$time),fun_sum)
conteo_total = melt(conteo_total)

#defining position of the text
out_conteo$halffreq = out_conteo$Freq *0.5
out_conteo$pos = 1
for(i in unique(out_conteo$time)){
  ind1 = which(out_conteo$time == i)
  out1 = out_conteo[ind1,]
  out0 = 0
  id_conteo = which(conteo_total$L1 == i)
  for(j in factor_classes){
    ind2 = which(out1$Var1 == j)
    out0 = out0 + out1[ind2,"Freq"]
    out_conteo[ind1[ind2],"pos"] = conteo_total[id_conteo,"value"]  - (out0 - out_conteo[ind1[ind2],"halffreq"])
  }
}

ggplot graphic

out_conteo$Class = factor(out_conteo$Var1, factor_classes)
#colors in the graphic
colors_classes = c("green4", "green3", "green1", "springgreen3", "springgreen1", "darkolivegreen3", "turquoise","plum2","blue2")
names(colors_classes) = factor_classes

ggplot(data=out_conteo, aes(x=time, y=Freq, fill= Class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_bar(stat="identity") +
  scale_fill_manual("Classes", values = colors_classes) +
  geom_text(aes(label = Freq,y = pos),size = 2)